Dual CRISPR Screen Analysis

Step 3: Construct Counting

Amanda Birmingham, CCBB, UCSD (abirmingham@ucsd.edu)

Instructions

To run this notebook reproducibly, follow these steps:

  1. Click Kernel > Restart & Clear Output
  2. When prompted, click the red Restart & clear all outputs button
  3. In the Input Parameters section, set each variable to the value for your analysis
  4. Click Cell > Run All

Input Parameters


In [ ]:
# Number of processors passed to ns_parallel.parallel_process_paired_reads for the counting step.
g_num_processors = 3
# Directory holding the filtered fastq files to be counted; expanded with ns_files.expand_path during set-up.
g_filtered_fastqs_dir = '~/dual_crispr/test_data/test_set_3'
# Path to the library definition file from which construct and gRNA info is extracted.
g_library_fp = '~/dual_crispr/library_definitions/test_library.txt'
# Sequence length passed to ns_extractor.trim_probes and to the gRNA position matcher.
g_len_of_seq_to_match = 19
# Number of mismatches allowed; the same value is used for both the forward and the reverse read.
g_num_allowed_mismatches = 1
# Run prefix used in the counts file names; presumably replaced by a generated prefix when left
# empty -- confirm in ns_runs.check_or_set.
g_fastq_counts_run_prefix = ''
# Directory that receives the counts files; expanded with ns_files.expand_path during set-up.
g_fastq_counts_dir = '~/dual_crispr/test_outputs/test_set_3'

Automated Set-Up


In [ ]:
import inspect

import ccbb_pyutils.analysis_run_prefixes as ns_runs
import ccbb_pyutils.files_and_paths as ns_files
import ccbb_pyutils.notebook_logging as ns_logs


def describe_var_list(input_var_name_list):
    """Build a printable summary of variables given by name.

    Each entry of input_var_name_list is a string that is evaluated with
    eval(); the result is rendered as "<name>: <value>" followed by a newline.

    Args:
        input_var_name_list: iterable of strings to evaluate and describe.

    Returns:
        A single string containing one "<name>: <value>\\n" line per entry,
        in input order (an empty string for an empty input).
    """
    # eval() is applied to the given strings as-is, so only pass trusted, hard-coded names.
    return "".join(
        "{0}: {1}\n".format(var_name, eval(var_name))
        for var_name in input_var_name_list
    )


# Configure logging through ccbb_pyutils; presumably sends INFO-level log messages to stdout so
# they appear in the notebook output -- confirm in ns_logs.set_stdout_info_logger.
ns_logs.set_stdout_info_logger()

In [ ]:
# Expand the user-supplied input paths (the values above start with '~') via ns_files.expand_path.
g_filtered_fastqs_dir = ns_files.expand_path(g_filtered_fastqs_dir)
g_library_fp = ns_files.expand_path(g_library_fp)
# Presumably keeps a user-supplied run prefix and otherwise uses the freshly generated one --
# confirm in ns_runs.check_or_set.
g_fastq_counts_run_prefix = ns_runs.check_or_set(g_fastq_counts_run_prefix, ns_runs.generate_run_prefix())
# Must come after g_filtered_fastqs_dir is expanded above: that value is passed here as the
# second (presumably fallback) argument of check_or_set.
g_fastq_counts_dir = ns_files.expand_path(ns_runs.check_or_set(g_fastq_counts_dir, g_filtered_fastqs_dir))

# Echo the resolved settings into the notebook output.
print(describe_var_list(['g_filtered_fastqs_dir', 'g_library_fp','g_fastq_counts_run_prefix', 'g_fastq_counts_dir']))
# Presumably creates the output directory when it does not already exist -- confirm in
# ns_files.verify_or_make_dir.
ns_files.verify_or_make_dir(g_fastq_counts_dir)

Construct Counting Functions


In [ ]:
# Print the source code of get_filtered_file_suffix into the notebook output.
import dual_crispr.count_filterer as ns_filter
print(inspect.getsource(ns_filter.get_filtered_file_suffix))

In [ ]:
# Print the full source code of the construct_file_extracter module into the notebook output.
import dual_crispr.construct_file_extracter as ns_extractor
print(inspect.getsource(ns_extractor))

In [ ]:
# Print the full source code of the grna_position_matcher module into the notebook output.
import dual_crispr.grna_position_matcher as ns_matcher
print(inspect.getsource(ns_matcher))

In [ ]:
# Print the full source code of the construct_counter module into the notebook output.
import dual_crispr.construct_counter as ns_counter
print(inspect.getsource(ns_counter))

In [ ]:
def count_constructs_for_one_fastq_pair(curr_base, run_prefix, seq_len, num_allowed_mismatches, constructs_fp,
                                        output_dir, fw_fastq_fp, rv_fastq_fp):
    """Generate the construct counts file for one forward/reverse fastq pair.

    Args:
        curr_base: first part of the output file name.
        run_prefix: second part of the output file name.
        seq_len: length passed to trim_probes and to the gRNA position matcher.
        num_allowed_mismatches: mismatch allowance, applied to both reads.
        constructs_fp: path of the file holding construct and gRNA information.
        output_dir: directory in which the counts file path is built.
        fw_fastq_fp: path of the forward-read fastq file.
        rv_fastq_fp: path of the reverse-read fastq file.

    Returns:
        None; the counts are handed to ns_counter.generate_construct_counts
        together with the output file path.
    """
    construct_names, full_grna_pairs = ns_extractor.extract_construct_and_grna_info(constructs_fp)
    trimmed_grna_pairs = ns_extractor.trim_probes(full_grna_pairs, seq_len)

    # The forward-read and reverse-read mismatch allowances currently share one value
    # (num_allowed_mismatches); pass two different values here if that should change.
    fw_allowed_mismatches = rv_allowed_mismatches = num_allowed_mismatches
    matcher = ns_matcher.GrnaPositionMatcher(trimmed_grna_pairs, seq_len, fw_allowed_mismatches,
                                             rv_allowed_mismatches)

    # Output path is assembled from base name, run prefix, and the counts-file suffix.
    counts_fp_parts = [curr_base, run_prefix, ns_counter.get_counts_file_suffix()]
    counts_fp = ns_files.build_multipart_fp(output_dir, counts_fp_parts)
    ns_counter.generate_construct_counts(matcher, construct_names, counts_fp, fw_fastq_fp, rv_fastq_fp)

In [ ]:
import ccbb_pyutils.parallel_process_fastqs as ns_parallel

# Apply count_constructs_for_one_fastq_pair to the paired fastq files found in
# g_filtered_fastqs_dir with the filtered-file suffix, using g_num_processors processors.
# The list supplies the extra arguments for the counting function; its order matches that
# function's run_prefix, seq_len, num_allowed_mismatches, constructs_fp, output_dir parameters
# (presumably the base name and the two fastq paths are supplied by ns_parallel -- confirm).
# NOTE(review): the meaning of the trailing True is not visible here -- confirm against
# ns_parallel.parallel_process_paired_reads.
g_parallel_results = ns_parallel.parallel_process_paired_reads(g_filtered_fastqs_dir, 
    ns_filter.get_filtered_file_suffix(), g_num_processors, count_constructs_for_one_fastq_pair, 
    [g_fastq_counts_run_prefix, g_len_of_seq_to_match, g_num_allowed_mismatches, g_library_fp,
     g_fastq_counts_dir], True)

In [ ]:
# Print the combined results returned by the parallel counting run.
print(ns_parallel.concatenate_parallel_results(g_parallel_results))

In [ ]:
# Check the output directory for files with this run prefix and the counts-file suffix, and print
# the result; check_failure_msg is the message supplied for the failure case.
print(ns_files.check_file_presence(g_fastq_counts_dir, g_fastq_counts_run_prefix, ns_counter.get_counts_file_suffix(),
                                  check_failure_msg="Construct counting failed to produce count file(s)."))